library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggfortify)
library(fastDummies)
library(mosaic)## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
houses <- read_csv("data/housing_prices.csv")## Rows: 19675 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ocean_proximity
## dbl (9): longitude, latitude, housing_median_age, total_rooms, total_bedroom...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(houses)## longitude latitude housing_median_age total_rooms
## Min. :-124.3 Min. :32.54 Min. : 1.00 Min. : 2
## 1st Qu.:-121.8 1st Qu.:33.93 1st Qu.:18.00 1st Qu.: 1438
## Median :-118.5 Median :34.27 Median :28.00 Median : 2111
## Mean :-119.6 Mean :35.65 Mean :28.39 Mean : 2620
## 3rd Qu.:-118.0 3rd Qu.:37.73 3rd Qu.:37.00 3rd Qu.: 3120
## Max. :-114.3 Max. :41.95 Max. :52.00 Max. :39320
##
## total_bedrooms population households median_income
## Min. : 2.0 Min. : 3 Min. : 2.0 Min. : 0.4999
## 1st Qu.: 297.0 1st Qu.: 796 1st Qu.: 282.0 1st Qu.: 2.5268
## Median : 436.0 Median : 1179 Median : 411.0 Median : 3.4500
## Mean : 539.6 Mean : 1441 Mean : 501.2 Mean : 3.6767
## 3rd Qu.: 648.0 3rd Qu.: 1746 3rd Qu.: 606.0 3rd Qu.: 4.5826
## Max. :6445.0 Max. :35682 Max. :6082.0 Max. :15.0001
## NA's :200
## median_house_value ocean_proximity
## Min. : 14999 Length:19675
## 1st Qu.:116600 Class :character
## Median :173800 Mode :character
## Mean :192478
## 3rd Qu.:248200
## Max. :500000
##
glimpse(houses)## Rows: 19,675
## Columns: 10
## $ longitude <dbl> -122.23, -122.22, -122.24, -122.25, -122.25, -122.2…
## $ latitude <dbl> 37.88, 37.86, 37.85, 37.85, 37.85, 37.85, 37.84, 37…
## $ housing_median_age <dbl> 41, 21, 52, 52, 52, 52, 52, 52, 42, 52, 52, 52, 52,…
## $ total_rooms <dbl> 880, 7099, 1467, 1274, 1627, 919, 2535, 3104, 2555,…
## $ total_bedrooms <dbl> 129, 1106, 190, 235, 280, 213, 489, 687, 665, 707, …
## $ population <dbl> 322, 2401, 496, 558, 565, 413, 1094, 1157, 1206, 15…
## $ households <dbl> 126, 1138, 177, 219, 259, 193, 514, 647, 595, 714, …
## $ median_income <dbl> 8.3252, 8.3014, 7.2574, 5.6431, 3.8462, 4.0368, 3.6…
## $ median_house_value <dbl> 452600, 358500, 352100, 341300, 342200, 269700, 299…
## $ ocean_proximity <chr> "NEAR BAY", "NEAR BAY", "NEAR BAY", "NEAR BAY", "NE…
skimr::skim(houses)| Name | houses |
| Number of rows | 19675 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ocean_proximity | 0 | 1 | 6 | 10 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| longitude | 0 | 1.00 | -119.56 | 2.01 | -124.35 | -121.76 | -118.50 | -117.99 | -114.31 | ▂▆▃▇▁ |
| latitude | 0 | 1.00 | 35.65 | 2.15 | 32.54 | 33.93 | 34.27 | 37.73 | 41.95 | ▇▁▅▂▁ |
| housing_median_age | 0 | 1.00 | 28.39 | 12.51 | 1.00 | 18.00 | 28.00 | 37.00 | 52.00 | ▃▇▇▇▅ |
| total_rooms | 0 | 1.00 | 2619.76 | 2181.35 | 2.00 | 1438.00 | 2111.00 | 3120.00 | 39320.00 | ▇▁▁▁▁ |
| total_bedrooms | 200 | 0.99 | 539.65 | 422.41 | 2.00 | 297.00 | 436.00 | 648.00 | 6445.00 | ▇▁▁▁▁ |
| population | 0 | 1.00 | 1440.81 | 1143.65 | 3.00 | 796.00 | 1179.00 | 1746.00 | 35682.00 | ▇▁▁▁▁ |
| households | 0 | 1.00 | 501.19 | 383.26 | 2.00 | 282.00 | 411.00 | 606.00 | 6082.00 | ▇▁▁▁▁ |
| median_income | 0 | 1.00 | 3.68 | 1.57 | 0.50 | 2.53 | 3.45 | 4.58 | 15.00 | ▇▇▁▁▁ |
| median_house_value | 0 | 1.00 | 192477.92 | 97711.51 | 14999.00 | 116600.00 | 173800.00 | 248200.00 | 500000.00 | ▅▇▅▂▁ |
There are 10 variables, 9 are numeric and at a first glance appear to be continuous.
total_bedrooms is missing 200 observations, none of the other columns contain missing data.
ggpairs(houses, columns = c("total_rooms", "total_bedrooms"))## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 200 rows containing missing values
## Warning: Removed 200 rows containing missing values (`geom_point()`).
## Warning: Removed 200 rows containing non-finite values (`stat_density()`).
The correlation value is 0.934 which suggests total_rooms and total_bedrooms are very strongly correlated. The relationship is evident in the scatter plot too.
houses_trim <- houses %>%
select(-total_bedrooms)ggpairs(houses_trim, progress = FALSE)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Significant correlations with median_house_value median_income; cor = 0.643
Possible correlations: latitude; cor = -0.148 (could indicate closer to sea) total_rooms; cor = 0.143 ocean_proximity; boxplot indicates one category has higher prices and one category has lower prices
median_income
ggplot(houses, aes(median_income, median_house_value)) +
geom_point(alpha = 0.1)latitude
ggplot(houses, aes(latitude, median_house_value)) +
geom_point(alpha = 0.1)This is interesting and latitude probably needs to be taken into consideration with longitude, otherwise it is like you are taking a line across the area and expecting all the houses to cost the same.
total_rooms
ggplot(houses, aes(total_rooms, median_house_value)) +
geom_point(alpha = 0.1)Surely the total rooms is highly dependent on the number of households - so if you calculate mean_room_per_household it might mean something.
ocean_proximity
ggplot(houses, aes(ocean_proximity, median_house_value)) +
geom_boxplot()Island appears to have the effect of increasing median house value while inland has the opposite effect and decreases the value.
Ocean proximity has 5 levels so we would expect 4 dummy variables
model <- lm(
median_house_value ~ median_income,
data = houses
)summary(model)##
## Call:
## lm(formula = median_house_value ~ median_income, data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -513966 -51564 -13979 36549 403935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45457.0 1359.0 33.45 <2e-16 ***
## median_income 39987.0 339.9 117.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74870 on 19673 degrees of freedom
## Multiple R-squared: 0.4129, Adjusted R-squared: 0.4129
## F-statistic: 1.384e+04 on 1 and 19673 DF, p-value: < 2.2e-16
The p-value is very low but we still need to check its valid. The R2 is 41% which is reasonable.
autoplot(model)Plot 1: Its hard ot interpret plot because of the large number of points but there appears to be some sort of upper limit for residuals depending on the fitted value.
Plot 2: A lot of the values are along the line but there is alos a bit of deviation
Plot 3: The upper limit observed in plot 1 is visible again here. This looks like heteroskewdasticity.
ggplot(houses, aes(median_income, median_house_value)) +
geom_point(alpha = 0.1) +
geom_smooth(method = "lm")## `geom_smooth()` using formula = 'y ~ x'
ggplot(houses) +
geom_density(aes(x = median_house_value))The data is capped at 500000. There is some discussion about it on Kaggle but they discuss a large number of houses being entered as 50001. This data appears to have been removed from the dataset we are using. The dataset on Kaggle has 20640 rows, the one we are using has 19675. The Kaggle discussion mentions 965 rows at 50001 median house value so these number make sense.
Ok so if we are ok with this upper limit then the diagnostics look ok and the p-value is low so median income is significant in terms of our model.
Going to add ocean proximity as it seems to be the other main variable that has an effect.
First need to convert to dummy columns
houses_dummy <- houses %>%
dummy_cols(select_columns = "ocean_proximity",
remove_first_dummy = TRUE) %>%
janitor::clean_names() %>%
mutate(across(ocean_proximity_inland:ocean_proximity_near_ocean, as.logical))island and inland were kept as columns. The less than 1 h from ocean column was removed as the dummy column. Deliberately left the ocean_proximity column in.
Now to build model including the island variable.
model_2 <- lm(
median_house_value ~ median_income + ocean_proximity_island,
data = houses_dummy
)summary(model_2)##
## Call:
## lm(formula = median_house_value ~ median_income + ocean_proximity_island,
## data = houses_dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -514154 -51494 -13940 36548 404045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45320.1 1357.6 33.382 < 2e-16 ***
## median_income 40008.7 339.6 117.828 < 2e-16 ***
## ocean_proximity_islandTRUE 225319.3 33449.9 6.736 1.67e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74780 on 19672 degrees of freedom
## Multiple R-squared: 0.4143, Adjusted R-squared: 0.4142
## F-statistic: 6958 on 2 and 19672 DF, p-value: < 2.2e-16
R2 is still 41% so no big improvement there The p-values of island is low (lets check the diagnostics)
autoplot(model_2)The plots all look similar to when it was just median_income
plotModel(model_2)Can see now why the island category has a different median_house_value but doesn’t change model much - there aren’t many observations for this category.
houses_dummy %>%
filter(ocean_proximity_island == TRUE) %>%
summarise(total = n())Oh dear, only 5 observations for island. Lets check the other categories.
houses_dummy %>%
group_by(ocean_proximity) %>%
summarise(total = n())The other variable category which had an effect was ocean proximity = inland and there are 6524 observations for this so lets try this instead.
model_3 <- lm(
median_house_value ~ median_income + ocean_proximity_inland,
data = houses_dummy
)summary(model_3)##
## Call:
## lm(formula = median_house_value ~ median_income + ocean_proximity_inland,
## data = houses_dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -481828 -42252 -11834 28057 376282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90207.1 1328.1 67.92 <2e-16 ***
## median_income 34861.2 305.7 114.02 <2e-16 ***
## ocean_proximity_inlandTRUE -78120.5 1019.7 -76.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65710 on 19672 degrees of freedom
## Multiple R-squared: 0.5478, Adjusted R-squared: 0.5478
## F-statistic: 1.192e+04 on 2 and 19672 DF, p-value: < 2.2e-16
The R2 has improved a bit and is now 55% (up from 41%). The p-value is low but we haven’t checked diagnostics yet The rse seems quite high at 65710
autoplot(model_3)Plot 1: It looks like a lot more points above the line but it is hard to tell when there are so many.
Plot 2: looks ok, not obviously better or worse than for median_income alone
Plot 3: Really looks like a funnel shape but is it just caused by the data being cut off at 50000?
plotModel(model_3, alpha = 0.1)plotModel(model_3) + facet_wrap( ~ ocean_proximity_inland)ocean_proximity_inland is significant in terms of our model. When it is TRUE it has the effect of lowering the median_house_value.
Try adding an interaction between log(median_income) and your chosen categorical predictor. Do you think this interaction term is statistically justified?
model_interactions <- lm(
formula = median_house_value ~ median_income + ocean_proximity_inland + log(median_income):ocean_proximity_inland,
data = houses_dummy
)summary(model_interactions)##
## Call:
## lm(formula = median_house_value ~ median_income + ocean_proximity_inland +
## log(median_income):ocean_proximity_inland, data = houses_dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -513943 -41636 -12761 27592 370836
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 90751 1928 47.065
## median_income 39741 1051 37.822
## ocean_proximity_inlandTRUE -67708 2882 -23.493
## ocean_proximity_inlandFALSE:log(median_income) -15373 3987 -3.855
## ocean_proximity_inlandTRUE:log(median_income) -24811 3714 -6.680
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## median_income < 2e-16 ***
## ocean_proximity_inlandTRUE < 2e-16 ***
## ocean_proximity_inlandFALSE:log(median_income) 0.000116 ***
## ocean_proximity_inlandTRUE:log(median_income) 2.45e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65620 on 19670 degrees of freedom
## Multiple R-squared: 0.549, Adjusted R-squared: 0.5489
## F-statistic: 5987 on 4 and 19670 DF, p-value: < 2.2e-16
I don’t understand why there is a TRUE and FALSE value for the interaction - we didn’t get this in the class example?
p-values are very low R1 is 55% so lower than for just median_income and ocean_proximity_inland
plotModel(model_interactions)## Warning in log(median_income): NaNs produced
## Warning: Removed 4 rows containing missing values (`geom_line()`).
autoplot(model_interactions)I can’ see any different in these plots compared to any of the others I’ve run??
Try it on its own anyway…
model_interactions2 <- lm(
formula = median_house_value ~ log(median_income):ocean_proximity_inland,
data = houses_dummy
)summary(model_interactions2)##
## Call:
## lm(formula = median_house_value ~ log(median_income):ocean_proximity_inland,
## data = houses_dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -293005 -44103 -14029 30017 421277
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 46494 1442 32.24
## log(median_income):ocean_proximity_inlandFALSE 139514 1106 126.20
## log(median_income):ocean_proximity_inlandTRUE 75280 1372 54.87
## Pr(>|t|)
## (Intercept) <2e-16 ***
## log(median_income):ocean_proximity_inlandFALSE <2e-16 ***
## log(median_income):ocean_proximity_inlandTRUE <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68370 on 19672 degrees of freedom
## Multiple R-squared: 0.5105, Adjusted R-squared: 0.5105
## F-statistic: 1.026e+04 on 2 and 19672 DF, p-value: < 2.2e-16
autoplot(model_interactions2)plotModel(model_interactions2)## Warning in log(median_income): NaNs produced
## Warning: Removed 4 rows containing missing values (`geom_line()`).
ggplot(houses_dummy, aes(log(median_income), median_house_value,
colour = ocean_proximity_inland, alpha = 0.1)) +
geom_point() +
geom_smooth(method = "lm")## `geom_smooth()` using formula = 'y ~ x'
I don’t know whether its statistically justified. The diagnostic plots look the same for every combination I’ve run, except for the shift along the x-axis. Either I’m doing something wrong or this is not a good example to be working with. I feel like I know less that when I started.
houses_dummy <- houses_dummy %>%
mutate(mean_rooms = total_rooms / households, .after = total_rooms)model_4 <- lm(
median_house_value ~ median_income + ocean_proximity_inland + mean_rooms,
data = houses_dummy
)summary(model_4)##
## Call:
## lm(formula = median_house_value ~ median_income + ocean_proximity_inland +
## mean_rooms, data = houses_dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -482512 -42251 -11797 28051 376245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90904.4 1476.6 61.56 <2e-16 ***
## median_income 34997.2 330.7 105.84 <2e-16 ***
## ocean_proximity_inlandTRUE -77807.2 1060.2 -73.39 <2e-16 ***
## mean_rooms -242.8 224.7 -1.08 0.28
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65710 on 19671 degrees of freedom
## Multiple R-squared: 0.5479, Adjusted R-squared: 0.5478
## F-statistic: 7945 on 3 and 19671 DF, p-value: < 2.2e-16
Wow this is not significant at all - interesting. You would really think that the number of rooms in a house would affect the price but I suppose it maybe doesn’t account for really expensive 1 bed flats in the city?